import numpy as np
import pandas as pd
import math
import seaborn as sb
from pylab import rcParams
from numpy import genfromtxt
#Main file
# Load the Skin Segmentation dataset (tab-separated: blue, green, red, class).
# header=None is required: the file has no header line, so the default
# header=0 would silently consume the first data sample as column names
# (the old code then overwrote those names, losing one row).
file = 'Skin_NonSkin.txt'
X = pd.read_csv(file, sep='\t', header=None,
                names=['blue', 'green', 'red', 'skin'])
X.head()  # notebook display of the first rows; no effect as a script
#Presets for print, figures
%matplotlib inline
rcParams['figure.figsize']=5, 4
sb.set_style('whitegrid')
#Pairwise plot
sb.pairplot(X, hue = 'skin',
plot_kws = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'},
size = 4)
#Dataset plot: interactive 3-D scatter of the raw colour channels.
import plotly as py
import plotly.graph_objs as go
trace1 = go.Scatter3d(
    x=X['blue'],
    y=X['green'],
    z=X['red'],
    text=['Red', 'Green', 'Blue'],
    showlegend=True,
    name='Skin',
    mode='markers',
    marker=dict(
        size=1,
        color=X['skin'],       # colour each point by its class label
        colorscale='Viridis',  # choose a colorscale
        opacity=1
    )
)
data = [trace1]
# Axis titles must match the data mapped above (x=blue, y=green, z=red);
# they were previously swapped (x titled 'Red', z titled 'Blue').
layout = go.Layout(
    title='Skin(purp) vs Non skin(yellow)',
    scene=dict(
        xaxis=dict(title='Blue'),
        yaxis=dict(title='Green'),
        zaxis=dict(title='Red'),
    ),
)
fig = go.Figure(data=data, layout=layout)
py.offline.init_notebook_mode()
py.offline.iplot(fig, image='png')
#Heatmap of the pairwise Pearson correlation coefficients between columns.
corrcoef = X.corr()
rcParams['figure.figsize'] = 5, 4
labels = corrcoef.columns.values
sb.heatmap(corrcoef, xticklabels=labels, yticklabels=labels)
# Per-column mean and standard deviation (displayed in the notebook).
mx = X.mean()
stdx = X.std()
mx, stdx
from IPython.display import Image
from IPython.core.display import HTML
import sklearn
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn import preprocessing
from matplotlib import pyplot as plt
# PCA must be fit on the three colour channels ONLY.  Fitting on all of X
# (as before) includes the 'skin' class label as a fourth feature, leaking
# the answer into the projection Z that is later fed to the classifiers.
features = X[['blue', 'green', 'red']]
pca = PCA()
X_pca = pca.fit_transform(features)
np.set_printoptions(formatter={'float_kind': '{:f}'.format})
# Cumulative fraction of variance explained by the leading eigenvectors.
ev = pca.explained_variance_ratio_
cs = np.cumsum(ev) / sum(ev)
plt.plot(['EigV 1', 'EigV 2', 'EigV 3'], cs[:3])
# Loadings of each principal component on the original channels.
comps = pd.DataFrame(pca.components_, columns=features.columns)
plt.plot(['Blue', 'Green', 'Red'], pca.components_[0, :3], 'b',
         ['Blue', 'Green', 'Red'], pca.components_[1, :3], 'g')
sb.heatmap(comps)
from numpy import array
# Project the centred data onto the first two eigenvectors.
mu = features.mean(0)
Z = np.dot(features - mu, comps[:2].T)
comps
rcParams['figure.figsize'] = 10, 9
plt.scatter(Z[:, 0], Z[:, 1], c=X['skin'], s=1)
plt.title("Projection on 2 eigenvectors")
# Z[:, 0] is the FIRST eigenvector score (the labels were swapped before).
plt.xlabel("EigV 1")
plt.ylabel("EigV 2")
# Reconstruct from the 2-D projection: Xhat = Z @ W[:2] + mu.  The previous
# code sliced components_[:2, :2], dropping the red-channel loadings and
# distorting the reconstruction.
Xhat = np.dot(Z, pca.components_[:2, :]) + mu.values
plt.scatter(Xhat[:, 0], Xhat[:, 1], c=X['skin'], s=1)
plt.title("Reconstruction of the original data")
plt.xlabel("Blue")
plt.ylabel("Green")
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn import metrics
from sklearn.metrics import confusion_matrix
np.set_printoptions(precision=4, suppress=True)
# Standardise the raw colour features and hold out a third for testing.
Z2 = preprocessing.scale(X[['blue', 'green', 'red']])
X_train, X_test, y_train, y_test = train_test_split(
    Z2, X['skin'], test_size=.33, random_state=17)
# Same split (same seed) for the standardised 2-D PCA projection.
Z3 = preprocessing.scale(Z)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    Z3, X['skin'], test_size=.33, random_state=17)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis

def evaluate_classifier(clf, X_tr, X_te, y_tr, y_te):
    """Fit `clf` on the training split, print the classification report and
    confusion matrix for the test split, and return the test accuracy.

    Replaces six copy-pasted fit/predict/report sections with one helper.
    """
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    print(metrics.classification_report(y_te, y_pred))
    print(confusion_matrix(y_te, y_pred))
    return metrics.accuracy_score(y_te, y_pred)

# k-NN on the original (scaled) features, then on the 2-D PCA projection.
knnc1 = evaluate_classifier(neighbors.KNeighborsClassifier(),
                            X_train, X_test, y_train, y_test)
knnc2 = evaluate_classifier(neighbors.KNeighborsClassifier(),
                            X2_train, X2_test, y2_train, y2_test)
# Quadratic discriminant analysis on both feature sets.
qdc1 = evaluate_classifier(QuadraticDiscriminantAnalysis(),
                           X_train, X_test, y_train, y_test)
qdc2 = evaluate_classifier(QuadraticDiscriminantAnalysis(),
                           X2_train, X2_test, y2_train, y2_test)
# Linear discriminant analysis on both feature sets.
ldc1 = evaluate_classifier(LinearDiscriminantAnalysis(),
                           X_train, X_test, y_train, y_test)
ldc2 = evaluate_classifier(LinearDiscriminantAnalysis(),
                           X2_train, X2_test, y2_train, y2_test)
# Grouped bar chart: test accuracy (%) per classifier,
# original features vs. the 2-D PCA projection.
classifier_labels = ['Knnc', 'Qdc', 'Ldc']
trace1 = go.Bar(
    x=classifier_labels,
    y=[score * 100 for score in (knnc1, qdc1, ldc1)],
    name='Original data'
)
trace2 = go.Bar(
    x=classifier_labels,
    y=[score * 100 for score in (knnc2, qdc2, ldc2)],
    name='PCA data'
)
data2 = [trace1, trace2]
layout = go.Layout(barmode='group')
fig = go.Figure(data=data2, layout=layout)
py.offline.init_notebook_mode()
py.offline.iplot(fig, image='png')